Library preperation

library(dplyr)
library(GenomicFeatures)
library(readr)
library(enrichR)
library(DESeq2)
library(ensembldb)
library(biomaRt)
library(clusterProfiler)
library(DOSE)

library(RColorBrewer)
library(gplots)
library(plotly)
library(pheatmap)
library(ggplot2)
library(easylabel)
library(enrichplot)
library(ggrepel)
library(stringr)

library(AnnotationDbi)
library(org.Hs.eg.db)
library(pathview)
library(gage)
library(gageData)
library(tximport)
library(txdbmaker)

Get nnotation data

txdb <- makeTxDbFromGFF("gencode.v46.annotation.gff3")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(txdb, k, "GENEID", "TXNAME")
## 'select()' returned 1:1 mapping between keys and columns
head(tx2gene)
samplesRaw <- read.table("samples.txt",sep='\t', header = TRUE)

Experiment summary & quantification

samples <- read.table("samples.txt",sep='\t', header = TRUE)
to_drop <- c("BIOMATERIAL_PROVIDER","BIOMATERIAL_PROVIDER","Center.Name","DATASTORE.provider", "Consent", "Bytes", "DATASTORE.filetype", "DATASTORE.region", "LibrarySelection", "Collection_Date", "geo_loc_name_country", "geo_loc_name_country_continent", "geo_loc_name","Assay.Type","BioSampleModel","Instrument", "Platform", "Organism", "version", "SRA.Study")
columns_to_keep <- setdiff(names(samples), to_drop)
samples <- samples %>% dplyr::select(all_of(columns_to_keep)) |> filter(LibrarySource =="TRANSCRIPTOMIC")

samples$tissue <- gsub("tibialis anterior \\(TA\\)", "TA", samples$tissue)
samples$tissue <- gsub("vastus lateralis \\(VL\\)", "VL", samples$tissue)

samples$tissue <- as.factor(samples$tissue)
samples$health_state <- as.factor(samples$health_state)
samples$sex <- as.factor(samples$sex)
samples$BulkRNAseq_sequencing_batch <- as.factor(samples$BulkRNAseq_sequencing_batch)
samples_healthy <- samples |> dplyr::filter(health_state == 'healthy')
samples_healthy$isolate <- as.factor(samples_healthy$isolate)

files <- file.path("quants", samples_healthy$Run, "quant.sf")
names(files) <- paste0(samples_healthy$Run)
txi.healthy <- tximport(files, type = "salmon", tx2gene = tx2gene)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 
## summarizing abundance
## summarizing counts
## summarizing length
head(txi.healthy$counts,3)
##                    SRR24750870 SRR24750871 SRR24750872 SRR24750873 SRR24750874
## ENSG00000000003.16      37.000      54.000      39.000      53.000      47.000
## ENSG00000000005.6       26.000       2.000       1.000       3.000       0.000
## ENSG00000000419.14     277.001     303.999     308.001     339.484     390.999
##                    SRR24750875 SRR24750879 SRR24750883 SRR24750884 SRR24750885
## ENSG00000000003.16     120.190          65      85.000         104      90.000
## ENSG00000000005.6        8.000           3       0.000           0       0.000
## ENSG00000000419.14     902.406         356     503.001         465     617.911
##                    SRR24750886 SRR24750887 SRR24750888 SRR24750889 SRR24750890
## ENSG00000000003.16      71.000      62.000     102.042       54.00      59.000
## ENSG00000000005.6        4.000       4.000       4.000        6.00       4.000
## ENSG00000000419.14     572.387     588.999     666.009      705.59     351.855
##                    SRR24750891 SRR24750892 SRR24750893 SRR24750894 SRR24750895
## ENSG00000000003.16       64.00      43.000      80.999      38.000      66.000
## ENSG00000000005.6         1.00       0.000       5.000       0.000       1.000
## ENSG00000000419.14      567.08     330.999     316.001     368.015     322.999
##                    SRR24750896 SRR24750897 SRR24750898 SRR24750899 SRR24750900
## ENSG00000000003.16          58      97.000      36.000      85.001          58
## ENSG00000000005.6            0      11.000       3.000       4.000           3
## ENSG00000000419.14         322     302.001     348.001     422.999         276
##                    SRR24750901 SRR24750902
## ENSG00000000003.16      42.000      37.000
## ENSG00000000005.6        0.000      46.000
## ENSG00000000419.14     296.982     284.343

Run DESeq2

samples_healthy$BulkRNAseq_RNA_integrity_number <- scale(samples_healthy$BulkRNAseq_RNA_integrity_number)

ddsTxi <- DESeqDataSetFromTximport(txi.healthy,
                                   colData = samples_healthy,
                                   design = ~ tissue  + BulkRNAseq_sequencing_batch + isolate)
## using counts and average transcript lengths from tximport
keep <- rowSums(counts(ddsTxi)) > 27
ddsTxi <- ddsTxi[keep,]
dds <- DESeq(ddsTxi)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast= c('tissue','TA',"VL"))
saveRDS(dds, 'dds_helathy.RDS')
summary(res)
## 
## out of 24834 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1829, 7.4%
## LFC < 0 (down)     : 1845, 7.4%
## outliers [1]       : 0, 0%
## low counts [2]     : 6259, 25%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res <- res[order(-abs(res$log2FoldChange)), ]
resDF <- as.data.frame(res)
resDF <- dplyr::as_tibble(resDF, rownames = "ensembl_gene_id")

resFiltered <- resDF %>% dplyr::filter(abs(log2FoldChange)>1, padj<0.05)

Graphical inspection

plotDispEsts(dds, main="Dispersion plot")

rld <- rlogTransformation(dds)
head(assay(rld))
##                    SRR24750870 SRR24750871 SRR24750872 SRR24750873 SRR24750874
## ENSG00000000003.16    5.917535    5.828125    5.665265    5.953147    5.928074
## ENSG00000000005.6     2.760746    1.814801    1.738242    1.886832    1.670663
## ENSG00000000419.14    8.313336    8.446128    8.476905    8.570571    8.660874
## ENSG00000000457.14    7.309331    7.472447    7.559360    7.837606    7.284328
## ENSG00000000460.17    5.660049    5.066636    4.927018    5.000241    5.538527
## ENSG00000000938.13    5.033313    5.381299    5.690622    5.700456    6.275428
##                    SRR24750875 SRR24750879 SRR24750883 SRR24750884 SRR24750885
## ENSG00000000003.16    5.859484    6.058560    6.274450    6.327279    5.819210
## ENSG00000000005.6     2.000234    1.868700    1.684107    1.670603    1.666329
## ENSG00000000419.14    9.015589    8.604115    8.960194    8.722805    8.868794
## ENSG00000000457.14    7.747022    7.541706    7.860924    7.970687    7.769912
## ENSG00000000460.17    5.466104    5.270120    5.597530    5.555343    5.680307
## ENSG00000000938.13    5.849431    5.102436    5.775555    5.117404    5.248318
##                    SRR24750886 SRR24750887 SRR24750888 SRR24750889 SRR24750890
## ENSG00000000003.16    5.791106    5.761190    5.834337    5.571565    6.064573
## ENSG00000000005.6     2.113572    1.953646    1.857564    2.033945    1.915987
## ENSG00000000419.14    8.927451    9.157233    8.872940    9.216153    8.461380
## ENSG00000000457.14    7.612505    7.582885    7.737253    7.941017    7.404776
## ENSG00000000460.17    5.207095    5.674224    5.690186    5.621894    5.153109
## ENSG00000000938.13    7.404563    4.785715    5.036387    5.296468    5.381870
##                    SRR24750891 SRR24750892 SRR24750893 SRR24750894 SRR24750895
## ENSG00000000003.16    5.739619    6.028900    6.300341    5.637114    6.145888
## ENSG00000000005.6     1.740251    1.679606    1.960789    1.677105    1.733698
## ENSG00000000419.14    9.047236    8.609687    8.373867    8.631808    8.417076
## ENSG00000000457.14    7.521993    7.699256    7.608436    7.670385    7.539165
## ENSG00000000460.17    5.758669    5.720892    5.307104    5.467281    5.823575
## ENSG00000000938.13    4.835364    4.839947    5.439222    4.677890    5.477796
##                    SRR24750896 SRR24750897 SRR24750898 SRR24750899 SRR24750900
## ENSG00000000003.16    5.868366    6.079529    5.775011    6.174295    6.176189
## ENSG00000000005.6     1.670717    2.351702    1.894241    1.921822    1.871424
## ENSG00000000419.14    8.587082    8.348925    8.674711    8.638459    8.338870
## ENSG00000000457.14    7.623633    7.465805    8.003973    7.633382    7.497142
## ENSG00000000460.17    5.784000    5.611291    5.932256    4.968874    6.281634
## ENSG00000000938.13    5.676657    5.907561    6.096818    5.774717    4.993957
##                    SRR24750901 SRR24750902
## ENSG00000000003.16    6.045202    5.708710
## ENSG00000000005.6     1.672514    3.226686
## ENSG00000000419.14    8.409216    8.406539
## ENSG00000000457.14    7.601665    7.734756
## ENSG00000000460.17    5.482304    5.276328
## ENSG00000000938.13    5.124617    5.336447
mycols <- brewer.pal(8, "Dark2")[1:length(unique(samples_healthy$tissue))]
mycols2 <- brewer.pal(8, "Set3")[1:length(unique(samples_healthy$sex))]

sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[samples_healthy$tissue],
          RowSideColors=mycols2[samples_healthy$sex],
          margin=c(10, 10), main="Sample Distance Matrix")
legend(x=-0.03,y=1.1, legend = levels(samples_healthy$tissue), fill = mycols, title = "Categories",bty='n')
legend(x=0.07,y=1.1, legend = levels(samples_healthy$sex), fill = mycols2, title = "Categories",bty='n')

df <- as.data.frame(colData(dds)[,c("tissue", "sex")])

pheatmap(as.matrix(sampleDists), annotation_col=df, annotation_row = df)

selected <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("tissue", "sex")])
pheatmap(assay(rld)[selected,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=FALSE, annotation_col=df)

DESeq2::plotPCA(rld, intgroup=c("tissue"))
## using ntop=500 top features by variance

pcaData <- plotPCA(rld, intgroup=c("tissue",'sex'), returnData=TRUE)
## using ntop=500 top features by variance
pcaData$isolate <- rld@colData@listData$isolate
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=tissue, shape=sex, label= str_sub(isolate, -2, -1))) +
  geom_point(size=3) +
    geom_path(aes(group = str_sub(isolate, -2, -1))) +
  geom_text_repel(aes(label=str_sub(isolate, -2, -1)), size=3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) + 
  coord_fixed() +
  theme_minimal()

# hist(res$pvalue, breaks=50, col="grey")
# DESeq2::plotMA(dds, ylim=c(-1,1))
# 
# # Volcano plot
# with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
# with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
fig <- plot_ly(x = res$pvalue, type = "histogram")
fig <- fig %>% layout(title = "Histogram of p-values",
                     xaxis = list(title = "Value"),
                     yaxis = list(title = "Frequency"))
fig
easyMAplot(res, output_shiny=FALSE)
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
## Ensembl site unresponsive, trying useast mirror
res <- res[order(-abs(res$log2FoldChange)), ]
resDF <- as.data.frame(res)
resFiltered <- resDF %>% dplyr::filter(padj<0.05)
universe <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id_version", tx2gene$GENEID, ensembl)

resFiltered$ensembl_gene_id <- gsub("\\.\\d+$", "", row.names(resFiltered))
gene_info <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id", resFiltered$ensembl_gene_id, ensembl) |> group_by(ensembl_gene_id) |> dplyr::slice_head(n = 1) |>  ungroup()
resFiltered <- resFiltered %>%
  left_join(gene_info, by = "ensembl_gene_id")
top_peaks <- as.data.frame(head(resFiltered,10)) 
a <- list()
for (i in seq_len(nrow(top_peaks))) {
  m <- top_peaks[i, ]
  annot = ifelse(m[["hgnc_symbol"]]=="", "novel", m[["hgnc_symbol"]])
  ax <- sample(c(-1, 1), 1) * runif(1, 5, 60)
  ay <- sample(c(-1, 1), 1) * runif(1, 5, 40)
  a[[i]] <- list(
    x = m[["log2FoldChange"]],
    y = -log10(m[["pvalue"]]),
    text = annot,
    xref = "x",
    yref = "y",
    showarrow = TRUE,
    arrowhead = 0.5,
    ax = ax,
    ay = ay
  )
}
easyVolcano(res, fccut = 2, output_shiny=FALSE) %>%   layout(annotations = a)
data(kegg.sets.hs)
data(sigmet.idx.hs)
kegg.sets.hs <- kegg.sets.hs[sigmet.idx.hs]
foldchanges <- resFiltered$log2FoldChange
names(foldchanges) <- resFiltered$entrezgene_id
keggres <- gage(foldchanges, gsets=kegg.sets.hs, same.dir=FALSE)
lapply(keggres, head)
## $greater
##                                                p.geomean stat.mean     p.val
## hsa04350 TGF-beta signaling pathway            0.1768113 0.9381318 0.1768113
## hsa04973 Carbohydrate digestion and absorption 0.2252355 0.7681066 0.2252355
## hsa00564 Glycerophospholipid metabolism        0.2594676 0.6578153 0.2594676
## hsa04020 Calcium signaling pathway             0.2710911 0.6117936 0.2710911
## hsa04530 Tight junction                        0.2840787 0.5733486 0.2840787
## hsa04610 Complement and coagulation cascades   0.3597654 0.3664711 0.3597654
##                                                    q.val set.size      exp1
## hsa04350 TGF-beta signaling pathway            0.9999842       22 0.1768113
## hsa04973 Carbohydrate digestion and absorption 0.9999842       13 0.2252355
## hsa00564 Glycerophospholipid metabolism        0.9999842       12 0.2594676
## hsa04020 Calcium signaling pathway             0.9999842       48 0.2710911
## hsa04530 Tight junction                        0.9999842       38 0.2840787
## hsa04610 Complement and coagulation cascades   0.9999842       10 0.3597654
## 
## $stats
##                                                stat.mean      exp1
## hsa04350 TGF-beta signaling pathway            0.9381318 0.9381318
## hsa04973 Carbohydrate digestion and absorption 0.7681066 0.7681066
## hsa00564 Glycerophospholipid metabolism        0.6578153 0.6578153
## hsa04020 Calcium signaling pathway             0.6117936 0.6117936
## hsa04530 Tight junction                        0.5733486 0.5733486
## hsa04610 Complement and coagulation cascades   0.3664711 0.3664711
keggrespathwaysUp <- data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tibble::as_tibble() %>%
  dplyr::filter(row_number()<=6) %>%
  .$id %>%
  as.character()
keggrespathwaysUp
## [1] "hsa04350 TGF-beta signaling pathway"           
## [2] "hsa04973 Carbohydrate digestion and absorption"
## [3] "hsa00564 Glycerophospholipid metabolism"       
## [4] "hsa04020 Calcium signaling pathway"            
## [5] "hsa04530 Tight junction"                       
## [6] "hsa04610 Complement and coagulation cascades"
keggresids <- substr(c(keggrespathwaysUp), start=1, stop=8)
keggresids
## [1] "hsa04350" "hsa04973" "hsa00564" "hsa04020" "hsa04530" "hsa04610"
plot_pathway <- function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa", new.signature=FALSE)
#detach("package:dplyr", unload=T)
tmp <- sapply(keggresids, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa"))
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04350.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04973.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa00564.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04020.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04530.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory D:/VLTA
## Info: Writing image file hsa04610.pathview.png
resDF <- resDF %>% mutate(diffexpressed = dplyr::case_when(
  log2FoldChange > 0 & padj < 0.05 ~ 'UP',
  log2FoldChange < 0 & padj < 0.05 ~ 'DOWN',
  padj > 0.05 ~ 'NO'
))
original_gene_list <- resDF$log2FoldChange
names(original_gene_list) <- gsub("\\.\\d+$", "", row.names(resDF))
gene_list<-na.omit(original_gene_list)
gene_list = sort(gene_list, decreasing = TRUE)
gse <- gseGO(geneList=gene_list, 
             ont ="ALL", 
             keyType = "ENSEMBL", 
             minGSSize = 3, 
             maxGSSize = 800, 
             pvalueCutoff = 0.05, 
             verbose = TRUE, 
             OrgDb = org.Hs.eg.db, 
             pAdjustMethod = "none")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
dotplot(gse, showCategory=10, font.size=7, split=".sign") + facet_grid(.~.sign)

#emapplot(gse, showCategory = 10)
websiteLive <- getOption("enrichR.live")
if (websiteLive) {
    listEnrichrSites()
    setEnrichrSite("Enrichr") # Human genes   
}
## Enrichr ... Connection is Live!
## FlyEnrichr ... Connection is Live!
## WormEnrichr ... Connection is Live!
## YeastEnrichr ... Connection is Live!
## FishEnrichr ... Connection is Live!
## OxEnrichr ... Connection is Live!
## Connection changed to https://maayanlab.cloud/Enrichr/
## Connection is Live!
if (websiteLive) dbs <- listEnrichrDbs()
dbs <- c("GO_Molecular_Function_2023", "GO_Cellular_Component_2023", "GO_Biological_Process_2023","GO_Molecular_Function_2018", "GO_Cellular_Component_2018", "GO_Biological_Process_2018")
if (websiteLive) {
    enriched <- enrichr(names(gene_list), dbs)
}
## Uploading data to Enrichr... Done.
##   Querying GO_Molecular_Function_2023... Done.
##   Querying GO_Cellular_Component_2023... Done.
##   Querying GO_Biological_Process_2023... Done.
##   Querying GO_Molecular_Function_2018... Done.
##   Querying GO_Cellular_Component_2018... Done.
##   Querying GO_Biological_Process_2018... Done.
## Parsing results... Done.
df <- resDF |> tibble::rownames_to_column("ensembl_gene_id") |> dplyr::filter(padj<0.05)
up_regulated <- df |> dplyr::filter(log2FoldChange > 0) |>   dplyr::mutate(ensembl_gene_id = gsub("\\.\\d+$", "", ensembl_gene_id)) |> dplyr::select(ensembl_gene_id)
write.table(up_regulated, 'up_ta_vl.txt', row.names = FALSE, col.names = FALSE, quote = FALSE)
down_regulated <- df |> dplyr::filter(log2FoldChange < 0) |>   dplyr::mutate(ensembl_gene_id = gsub("\\.\\d+$", "", ensembl_gene_id)) |> dplyr::select(ensembl_gene_id)
write.table(down_regulated, 'down_ta_vl.txt', row.names = FALSE, col.names = FALSE, quote = FALSE)
# Choose databases
dbs <- c("GO_Biological_Process_2021", "KEGG_2021_Human", "WikiPathways_2019_Human", "Jensen_DISEASES")
genes <- names(gene_list)
# Run enrichment analysis 
enriched_results <- enrichr(genes, dbs)
## Uploading data to Enrichr... Done.
##   Querying GO_Biological_Process_2021... Done.
##   Querying KEGG_2021_Human... Done.
##   Querying WikiPathways_2019_Human... Done.
##   Querying Jensen_DISEASES... Done.
## Parsing results... Done.
# Access results for a specific database
go_results <- enriched_results[["GO_Biological_Process_2021"]]
kegg_results <- enriched_results[["KEGG_2021_Human"]]
head(kegg_results)
head(go_results)
genes <- names(gene_list)
unvierse1 <- gsub("\\.\\d+$", "", row.names(resDF))
unvierse2 <- getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id","description","entrezgene_description"), "ensembl_gene_id", unvierse1, ensembl) |> group_by(ensembl_gene_id) |> dplyr::slice_head(n = 1) |>  ungroup()
universe3 <- unvierse2$entrezgene_id
ego <- enrichGO(gene          = as.character(gene_info$entrezgene_id),
                universe      = as.character(universe3),
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05,
                readable      = TRUE)
head(ego)
egoDF <- ego@result
write.table(egoDF, 'BP_TA_VL.csv')
egoCC <- enrichGO(gene          = as.character(gene_info$entrezgene_id),
                #universe      = as.character(universe$entrezgene_id),
                OrgDb         = org.Hs.eg.db,
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.1,
                readable      = TRUE)
egoCCDF <- egoCC@result
write.table(egoCCDF, 'CC_TA_VL.csv')
ggo <- groupGO(gene     = as.character(gene_info$entrezgene_id),
               OrgDb    = org.Hs.eg.db,
               ont      = "BP",
               level    = 4,
               readable = TRUE)
head(ggo)
gggo <- ggo@result
w <- egoCCDF %>% filter(pvalue < 0.05) %>% select(c("ID","pvalue"))
write.table(w, "table2.tsv",quote = FALSE, row.names = FALSE)